{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 前期准备工作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import base64\n",
    "import json\n",
    "import os\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import requests\n",
    "import requests_html\n",
    "from lxml.html import fromstring\n",
    "from PIL import Image, ImageEnhance\n",
    "from requests_html import HTMLSession\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the Chrome options, then launch the browser that every later cell drives.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the 'DevToolsActivePort file doesn't exist' startup error\n",
    "# Chrome documents this switch as WIDTH,HEIGHT; the previous '1920x3000' value did not use that form.\n",
    "opts.add_argument('--window-size=1920,3000')\n",
    "opts.add_argument('--disable-gpu')  # Google's documentation mentions adding this flag to avoid a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for certain special pages\n",
    "# Download directory handed to Chrome. Set the CNKI_PDF_DIR environment variable to override\n",
    "# the machine-specific default below without editing the notebook.\n",
    "out_path = os.environ.get('CNKI_PDF_DIR', r'D:\\桌面\\Python worker space\\数据挖掘\\数据挖掘期末项目\\pdf文件夹')\n",
    "prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': out_path}\n",
    "opts.add_experimental_option('prefs', prefs)\n",
    "# `chrome_options=` is deprecated (it raised a DeprecationWarning here); `options=` is the supported keyword.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 选择知网作为爬取对象"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the CNKI home page in the driver's current window.\n",
    "cnki_home_url = 'https://cnki.net'\n",
    "driver.get(cnki_home_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open advanced search by clicking the element whose id is 'highSearch'.\n",
    "# find_element(By.ID, ...) replaces the deprecated find_element_by_id helper.\n",
    "element = driver.find_element(By.ID, 'highSearch')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-AA975FEA559C974FA2DFC63B35E38F06'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check window information: display the handle of the window the driver is attached to.\n",
    "handle_before_switch = driver.current_window_handle\n",
    "handle_before_switch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch to the advanced-search window, i.e. the second entry of driver.window_handles.\n",
    "# driver.switch_to.window replaces the deprecated driver.switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-EAF342ADF30E155582CAE0CD0CE07FBF'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the current window handle again to confirm the switch took effect.\n",
    "handle_after_switch = driver.current_window_handle\n",
    "handle_after_switch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the academic-journals entry.\n",
    "# find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath helper.\n",
    "# NOTE(review): absolute XPath; it breaks if the page layout changes.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/ul[1]/li[1]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the professional-search tab.\n",
    "# find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath helper.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/ul/li[4]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tick the source checkboxes.\n",
    "# The six inputs share one XPath and differ only in the label index (label[1] .. label[6]),\n",
    "# so they are clicked in a loop instead of six copy-pasted cells.\n",
    "source_checkbox_xpath = (\n",
    "    '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[{}]/input'\n",
    ")\n",
    "for label_index in range(1, 7):\n",
    "    # find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath helper.\n",
    "    element = driver.find_element(By.XPATH, source_checkbox_xpath.format(label_index))\n",
    "    element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search expression to fill in: two SU terms joined with AND.\n",
    "# NOTE(review): the first term uses ASCII double quotes, the second full-width quotes;\n",
    "# the string is kept exactly as it was - confirm the mixed quoting is intended.\n",
    "search_terms = ('SU = \"新媒体\"', 'SU = “网络”')\n",
    "query = ' AND '.join(search_terms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clear the search textarea, then type the query into it.\n",
    "# find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath helper.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/textarea')\n",
    "element.clear()\n",
    "element.send_keys(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the search button.\n",
    "# find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath helper.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[2]/input')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'共找到<em>1,393</em>条结果'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the result summary and article count: show the inner HTML of the pager title span.\n",
    "# find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath helper.\n",
    "element = driver.find_element(By.XPATH, '//span[@class=\"pagerTitleCell\"]')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Change the number of articles per page: first click the icon inside the 'perPageDiv' control.\n",
    "# find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath helper.\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"perPageDiv\"]/div/i')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the third entry of the list inside the 'perPageDiv' control.\n",
    "# find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath helper.\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"perPageDiv\"]/ul/li[3]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>加强新时代高校网络思想政治教育探究</td>\n",
       "      <td>高歌</td>\n",
       "      <td>学校党建与思想教育</td>\n",
       "      <td>2021-06-23</td>\n",
       "      <td>NaN</td>\n",
       "      <td>301</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>新媒体时代高校网络舆情引导机制探析</td>\n",
       "      <td>孙璐</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-06-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>98</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>新媒体环境下网络广告创意设计研究——评《新媒体时代下的网络广告设计应用》</td>\n",
       "      <td>赵静静</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-06-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>51</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>企业营销竞争力与消费者选择：社交网络营销的作用机理</td>\n",
       "      <td>周美</td>\n",
       "      <td>商业经济研究</td>\n",
       "      <td>2021-06-03</td>\n",
       "      <td>NaN</td>\n",
       "      <td>296</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>新媒体对高校思政教育的积极影响研究——评《高校网络思政教育平台的构建及其应用研究》</td>\n",
       "      <td>唐海玲</td>\n",
       "      <td>中国测试</td>\n",
       "      <td>2021-05-31</td>\n",
       "      <td>NaN</td>\n",
       "      <td>73</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>“转发”行为的扩散与新媒体赋权——基于微博自闭症议题的社会网络分析</td>\n",
       "      <td>黄月琴; 黄宪成</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>766</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>新媒体视角下农产品网络营销策略创新研究</td>\n",
       "      <td>曹洁</td>\n",
       "      <td>农业经济</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>869</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>嬗变、冲突与重构：新媒体视域下的网络舆论</td>\n",
       "      <td>陈晓伟; 董烁</td>\n",
       "      <td>中国编辑</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>407</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>广播电视和网络视听产业基地高质量发展策略研究</td>\n",
       "      <td>祝歆; 王森; 宋丽萍</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2021-05-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>94</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>互联网背景下大学公共英语教学效率提升研究——评《网络与新媒体专业英语教程》</td>\n",
       "      <td>赵鹏</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2021-05-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>49</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>网络时代传统媒体记者思维与角色的转换</td>\n",
       "      <td>初敏</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-04-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>66</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>新媒体时代图书网络营销矩阵建设实务研究</td>\n",
       "      <td>郑丽珠</td>\n",
       "      <td>出版广角</td>\n",
       "      <td>2021-04-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>134</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>嵌入基层治理：县级融媒体中心与基层网络政务服务的融合发展</td>\n",
       "      <td>谢新洲; 石林</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>272</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>长城新媒体集团融合创新春晚形态——“河北网络春节云联欢”云端放异彩</td>\n",
       "      <td>李建; 田少华; 李遥</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>新媒体视域下赣南脐橙网络营销策略优化研究</td>\n",
       "      <td>唐剑鸿</td>\n",
       "      <td>食品研究与开发</td>\n",
       "      <td>2021-04-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>357</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>新媒体背景下学校对网络舆论场域的正向导引</td>\n",
       "      <td>胡杰</td>\n",
       "      <td>教学与管理</td>\n",
       "      <td>2021-04-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>220</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>网络新媒体背景下铸造专业学术思政教育面临的挑战及对策</td>\n",
       "      <td>陈艳丽</td>\n",
       "      <td>特种铸造及有色合金</td>\n",
       "      <td>2021-04-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>60</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>网络社交媒体情感动员的成因及策略</td>\n",
       "      <td>宋亮</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-03-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>125</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>网络问答社区科普质量评价研究——以“知乎”为例</td>\n",
       "      <td>周一杨</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-03-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>95</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>网络时代红色资源在高校思政课中的应用</td>\n",
       "      <td>范小青</td>\n",
       "      <td>学校党建与思想教育</td>\n",
       "      <td>2021-03-23</td>\n",
       "      <td>NaN</td>\n",
       "      <td>550</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>现状与特征:社会网络分析在我国传播学研究中的应用</td>\n",
       "      <td>瞿旭晟; 赵鹏程</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-03-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>434</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>从娱乐至死到网络至死:新媒体发展与城市生活状态</td>\n",
       "      <td>廖媌婧; 曾庆江</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-03-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>332</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>信息化时代我国网络政治生态治理研究</td>\n",
       "      <td>许开轶</td>\n",
       "      <td>理论学刊</td>\n",
       "      <td>2021-03-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>127</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>出圈与折叠：2020年网络热点事件的舆论特征及对内容生产的意义</td>\n",
       "      <td>周葆华</td>\n",
       "      <td>新闻界</td>\n",
       "      <td>2021-03-08 13:06</td>\n",
       "      <td>3.0</td>\n",
       "      <td>877</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>关于高校网络意识形态安全建设的新考量</td>\n",
       "      <td>潘红涛</td>\n",
       "      <td>学校党建与思想教育</td>\n",
       "      <td>2021-03-08</td>\n",
       "      <td>2.0</td>\n",
       "      <td>428</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>提升高校社会主义核心价值观网络传播效果研究</td>\n",
       "      <td>高蕾; 魏楚元; 王洋</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-02-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>136</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>网络时代民族团结教育研究——铸牢中华民族共同体意识研究系列论文之一</td>\n",
       "      <td>王卓</td>\n",
       "      <td>广西民族研究</td>\n",
       "      <td>2021-02-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>215</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>新媒体技术在英语教学与管理中的应用——评《基于网络多媒体的当代英语教学新探》</td>\n",
       "      <td>舒伟; 王若语</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>76</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>浅析广播电视新闻评论在网络媒体中的新常态运用</td>\n",
       "      <td>李节; 钟强</td>\n",
       "      <td>当代电视</td>\n",
       "      <td>2021-02-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>127</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>基于网络结构与内容分布的新媒体事件聚类研究</td>\n",
       "      <td>马昊; 马晓悦</td>\n",
       "      <td>现代情报</td>\n",
       "      <td>2021-02-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>165</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>社会网络及其在新媒体环境下对青少年吸烟行为的影响研究</td>\n",
       "      <td>戴珞佳; 谭银亮; 朱静芬</td>\n",
       "      <td>现代预防医学</td>\n",
       "      <td>2021-01-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>183</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>新媒体时代公众参与网络信息治理的实现路径</td>\n",
       "      <td>魏小雨</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-01-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>128</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>提升高校网络育人成效的路径研究</td>\n",
       "      <td>丰硕</td>\n",
       "      <td>学校党建与思想教育</td>\n",
       "      <td>2021-01-18</td>\n",
       "      <td>NaN</td>\n",
       "      <td>509</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>媒介素养研究核心议题:基于CSSCI期刊关键词网络分析</td>\n",
       "      <td>罗雁飞</td>\n",
       "      <td>中国出版</td>\n",
       "      <td>2021-01-16</td>\n",
       "      <td>NaN</td>\n",
       "      <td>571</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>2020年网络新媒体传播:重大现实主题与学科研究进展</td>\n",
       "      <td>孟威</td>\n",
       "      <td>当代传播</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>507</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>社会责任将成网络平台的生命线</td>\n",
       "      <td>陈永东</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>166</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>网络青年亚文化的特征及引领路径探析</td>\n",
       "      <td>谌韵灵; 邹升平</td>\n",
       "      <td>南通大学学报(社会科学版)</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>977</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>新媒体传播视野下民间艺术的突围——二人转艺术元素网络直播认知</td>\n",
       "      <td>刘帅</td>\n",
       "      <td>戏剧文学</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>113</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>网络的法律地位:行政确认与《民法典》法律界定</td>\n",
       "      <td>陆小华</td>\n",
       "      <td>山西大学学报(哲学社会科学版)</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>2.0</td>\n",
       "      <td>175</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>多媒体环境下社会主义核心价值观融入高校校园文化建设——评《新媒体时代议程设置嵌入高校网络思想...</td>\n",
       "      <td>刘静</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>227</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>构建网络内容治理主体协同机制的作用与优化路径</td>\n",
       "      <td>谢新洲; 宋琢</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-01-05</td>\n",
       "      <td>1.0</td>\n",
       "      <td>411</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>网络新媒体舆论的问题呈现和公共决策研究——评《网络新媒体舆论与公共决策——两个系统的互动研究》</td>\n",
       "      <td>马晓霞</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2021-01-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>56</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>介入·引入·打入——传统主流媒体时评引导、平衡网络舆论探析</td>\n",
       "      <td>范金刚</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2020-12-30</td>\n",
       "      <td>1.0</td>\n",
       "      <td>170</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>高校教育网络环境对大学生心理健康维护的影响研究——评《网络环境下大学生心理健康教育研究》</td>\n",
       "      <td>贺天庆</td>\n",
       "      <td>中国学校卫生</td>\n",
       "      <td>2020-12-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>243</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>网络综艺脱口秀的青年亚文化叙事</td>\n",
       "      <td>肖雪菁</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>2.0</td>\n",
       "      <td>489</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>互联网背景下网络和新媒体技术对文学发展的影响研究</td>\n",
       "      <td>潘裕仙</td>\n",
       "      <td>食品研究与开发</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>1.0</td>\n",
       "      <td>185</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>大学生网络思想政治教育的范式演进与经验启示</td>\n",
       "      <td>梁钦; 蒲清平; 肖国芳</td>\n",
       "      <td>思想政治教育研究</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>3.0</td>\n",
       "      <td>805</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>基于舆论引导的网络新闻传播规划——评《新闻传播学热点专题:知识图谱》</td>\n",
       "      <td>柳太江</td>\n",
       "      <td>中国油脂</td>\n",
       "      <td>2020-12-16</td>\n",
       "      <td>1.0</td>\n",
       "      <td>241</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>新媒体产业资本流通与价值转移的影响机制研究——以网络视听行业为例</td>\n",
       "      <td>王建磊</td>\n",
       "      <td>新闻大学</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>482</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>新文科背景下“网络与新媒体”专业人才培养</td>\n",
       "      <td>张宏邦</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2020-12-10</td>\n",
       "      <td>3.0</td>\n",
       "      <td>519</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                                 篇名  \\\n",
       "0            1                                  加强新时代高校网络思想政治教育探究   \n",
       "1            2                                  新媒体时代高校网络舆情引导机制探析   \n",
       "2            3               新媒体环境下网络广告创意设计研究——评《新媒体时代下的网络广告设计应用》   \n",
       "3            4                          企业营销竞争力与消费者选择：社交网络营销的作用机理   \n",
       "4            5          新媒体对高校思政教育的积极影响研究——评《高校网络思政教育平台的构建及其应用研究》   \n",
       "5            6                  “转发”行为的扩散与新媒体赋权——基于微博自闭症议题的社会网络分析   \n",
       "6            7                                新媒体视角下农产品网络营销策略创新研究   \n",
       "7            8                               嬗变、冲突与重构：新媒体视域下的网络舆论   \n",
       "8            9                             广播电视和网络视听产业基地高质量发展策略研究   \n",
       "9           10              互联网背景下大学公共英语教学效率提升研究——评《网络与新媒体专业英语教程》   \n",
       "10          11                                 网络时代传统媒体记者思维与角色的转换   \n",
       "11          12                                新媒体时代图书网络营销矩阵建设实务研究   \n",
       "12          13                       嵌入基层治理：县级融媒体中心与基层网络政务服务的融合发展   \n",
       "13          14                  长城新媒体集团融合创新春晚形态——“河北网络春节云联欢”云端放异彩   \n",
       "14          15                               新媒体视域下赣南脐橙网络营销策略优化研究   \n",
       "15          16                               新媒体背景下学校对网络舆论场域的正向导引   \n",
       "16          17                         网络新媒体背景下铸造专业学术思政教育面临的挑战及对策   \n",
       "17          18                                   网络社交媒体情感动员的成因及策略   \n",
       "18          19                            网络问答社区科普质量评价研究——以“知乎”为例   \n",
       "19          20                                 网络时代红色资源在高校思政课中的应用   \n",
       "20          21                           现状与特征:社会网络分析在我国传播学研究中的应用   \n",
       "21          22                            从娱乐至死到网络至死:新媒体发展与城市生活状态   \n",
       "22          23                                  信息化时代我国网络政治生态治理研究   \n",
       "23          24                    出圈与折叠：2020年网络热点事件的舆论特征及对内容生产的意义   \n",
       "24          25                                 关于高校网络意识形态安全建设的新考量   \n",
       "25          26                              提升高校社会主义核心价值观网络传播效果研究   \n",
       "26          27                  网络时代民族团结教育研究——铸牢中华民族共同体意识研究系列论文之一   \n",
       "27          28             新媒体技术在英语教学与管理中的应用——评《基于网络多媒体的当代英语教学新探》   \n",
       "28          29                             浅析广播电视新闻评论在网络媒体中的新常态运用   \n",
       "29          30                              基于网络结构与内容分布的新媒体事件聚类研究   \n",
       "30          31                         社会网络及其在新媒体环境下对青少年吸烟行为的影响研究   \n",
       "31          32                               新媒体时代公众参与网络信息治理的实现路径   \n",
       "32          33                                    提升高校网络育人成效的路径研究   \n",
       "33          34                        媒介素养研究核心议题:基于CSSCI期刊关键词网络分析   \n",
       "34          35                         2020年网络新媒体传播:重大现实主题与学科研究进展   \n",
       "35          36                                     社会责任将成网络平台的生命线   \n",
       "36          37                                  网络青年亚文化的特征及引领路径探析   \n",
       "37          38                     新媒体传播视野下民间艺术的突围——二人转艺术元素网络直播认知   \n",
       "38          39                             网络的法律地位:行政确认与《民法典》法律界定   \n",
       "39          40  多媒体环境下社会主义核心价值观融入高校校园文化建设——评《新媒体时代议程设置嵌入高校网络思想...   \n",
       "40          41                             构建网络内容治理主体协同机制的作用与优化路径   \n",
       "41          42    网络新媒体舆论的问题呈现和公共决策研究——评《网络新媒体舆论与公共决策——两个系统的互动研究》   \n",
       "42          43                      介入·引入·打入——传统主流媒体时评引导、平衡网络舆论探析   \n",
       "43          44       高校教育网络环境对大学生心理健康维护的影响研究——评《网络环境下大学生心理健康教育研究》   \n",
       "44          45                                    网络综艺脱口秀的青年亚文化叙事   \n",
       "45          46                           互联网背景下网络和新媒体技术对文学发展的影响研究   \n",
       "46          47                              大学生网络思想政治教育的范式演进与经验启示   \n",
       "47          48                 基于舆论引导的网络新闻传播规划——评《新闻传播学热点专题:知识图谱》   \n",
       "48          49                   新媒体产业资本流通与价值转移的影响机制研究——以网络视听行业为例   \n",
       "49          50                               新文科背景下“网络与新媒体”专业人才培养   \n",
       "\n",
       "               作者               刊名              发表时间   被引   下载  操作  \n",
       "0              高歌        学校党建与思想教育        2021-06-23  NaN  301  下载  \n",
       "1              孙璐            新闻爱好者        2021-06-20  NaN   98  下载  \n",
       "2             赵静静            新闻爱好者        2021-06-20  NaN   51  下载  \n",
       "3              周美           商业经济研究        2021-06-03  NaN  296  下载  \n",
       "4             唐海玲             中国测试        2021-05-31  NaN   73  下载  \n",
       "5        黄月琴; 黄宪成             新闻记者        2021-05-20  NaN  766  下载  \n",
       "6              曹洁             农业经济        2021-05-15  NaN  869  下载  \n",
       "7         陈晓伟; 董烁             中国编辑        2021-05-10  NaN  407  下载  \n",
       "8     祝歆; 王森; 宋丽萍         中国广播电视学刊        2021-05-01  NaN   94  下载  \n",
       "9              赵鹏         中国广播电视学刊        2021-05-01  NaN   49  下载  \n",
       "10             初敏             青年记者        2021-04-30  NaN   66  下载  \n",
       "11            郑丽珠             出版广角        2021-04-30  NaN  134  下载  \n",
       "12        谢新洲; 石林               传媒        2021-04-25  NaN  272  下载  \n",
       "13    李建; 田少华; 李遥               传媒        2021-04-25  NaN   34  下载  \n",
       "14            唐剑鸿          食品研究与开发        2021-04-20  NaN  357  下载  \n",
       "15             胡杰            教学与管理        2021-04-20  NaN  220  下载  \n",
       "16            陈艳丽        特种铸造及有色合金        2021-04-20  NaN   60  下载  \n",
       "17             宋亮             青年记者        2021-03-30  NaN  125  下载  \n",
       "18            周一杨             青年记者        2021-03-30  NaN   95  下载  \n",
       "19            范小青        学校党建与思想教育        2021-03-23  NaN  550  下载  \n",
       "20       瞿旭晟; 赵鹏程            新闻爱好者        2021-03-20  NaN  434  下载  \n",
       "21       廖媌婧; 曾庆江            新闻爱好者        2021-03-20  NaN  332  下载  \n",
       "22            许开轶             理论学刊        2021-03-15  NaN  127  下载  \n",
       "23            周葆华              新闻界  2021-03-08 13:06  3.0  877  下载  \n",
       "24            潘红涛        学校党建与思想教育        2021-03-08  2.0  428  下载  \n",
       "25    高蕾; 魏楚元; 王洋               传媒        2021-02-25  1.0  136  下载  \n",
       "26             王卓           广西民族研究        2021-02-20  NaN  215  下载  \n",
       "27        舒伟; 王若语           中国科技论文        2021-02-15  NaN   76  下载  \n",
       "28         李节; 钟强             当代电视        2021-02-01  NaN  127  下载  \n",
       "29        马昊; 马晓悦             现代情报        2021-02-01  NaN  165  下载  \n",
       "30  戴珞佳; 谭银亮; 朱静芬           现代预防医学        2021-01-25  NaN  183  下载  \n",
       "31            魏小雨            新闻爱好者        2021-01-20  NaN  128  下载  \n",
       "32             丰硕        学校党建与思想教育        2021-01-18  NaN  509  下载  \n",
       "33            罗雁飞             中国出版        2021-01-16  NaN  571  下载  \n",
       "34             孟威             当代传播        2021-01-15  NaN  507  下载  \n",
       "35            陈永东             青年记者        2021-01-15  NaN  166  下载  \n",
       "36       谌韵灵; 邹升平    南通大学学报(社会科学版)        2021-01-15  1.0  977  下载  \n",
       "37             刘帅             戏剧文学        2021-01-15  NaN  113  下载  \n",
       "38            陆小华  山西大学学报(哲学社会科学版)        2021-01-15  2.0  175  下载  \n",
       "39             刘静           中国科技论文        2021-01-15  NaN  227  下载  \n",
       "40        谢新洲; 宋琢            新闻与写作        2021-01-05  1.0  411  下载  \n",
       "41            马晓霞         中国广播电视学刊        2021-01-01  NaN   56  下载  \n",
       "42            范金刚             青年记者        2020-12-30  1.0  170  下载  \n",
       "43            贺天庆           中国学校卫生        2020-12-25  1.0  243  下载  \n",
       "44            肖雪菁             青年记者        2020-12-20  2.0  489  下载  \n",
       "45            潘裕仙          食品研究与开发        2020-12-20  1.0  185  下载  \n",
       "46   梁钦; 蒲清平; 肖国芳         思想政治教育研究        2020-12-20  3.0  805  下载  \n",
       "47            柳太江             中国油脂        2020-12-16  1.0  241  下载  \n",
       "48            王建磊             新闻大学        2020-12-15  NaN  482  下载  \n",
       "49            张宏邦             青年记者        2020-12-10  3.0  519  下载  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看页面信息\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "含有页面主要数据的表格_HTML = element.get_attribute('innerHTML')\n",
    "pd.read_html(含有页面主要数据的表格_HTML)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取验证码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 获取验证码\n",
    "def images_mining():\n",
    "    element = driver.find_element_by_xpath('//*[@id=\"vImg\"]')\n",
    "    element.click()\n",
    "    time.sleep(2)\n",
    "    screenImg = r\"D:\\桌面\\Python worker space\\数据挖掘\\数据挖掘期末项目\\验证码储存\\screenImg.png\"\n",
    "    driver.get_screenshot_as_file(screenImg)\n",
    "    block = driver.find_element_by_xpath('//*[@id=\"vImg\"]')\n",
    "    location = block.location\n",
    "    size = block.size\n",
    "    left = location['x']\n",
    "    top = 350\n",
    "    right = location['x'] + 330\n",
    "    bottom = 350 + size['height']\n",
    "    img = Image.open(screenImg).crop((left, top, right, bottom))\n",
    "    img = img.convert('RGBA')\n",
    "    img = img.convert('L')\n",
    "    img = ImageEnhance.Contrast(img)\n",
    "    img = img.enhance(2.0)\n",
    "    img.save(screenImg)\n",
    "    img = Image.open(screenImg)\n",
    "    return img"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "ename": "NoSuchElementException",
     "evalue": "Message: no such element: Unable to locate element: {\"method\":\"xpath\",\"selector\":\"//*[@id=\"vImg\"]\"}\n  (Session info: chrome=91.0.4472.114)\n",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNoSuchElementException\u001b[0m                    Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-34-f02cc6c4cf5b>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mimages_mining\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-33-67fe7a7a1b37>\u001b[0m in \u001b[0;36mimages_mining\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# 获取验证码\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mimages_mining\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m     \u001b[0melement\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'//*[@id=\"vImg\"]'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m     \u001b[0melement\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclick\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m     \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mfind_element_by_xpath\u001b[1;34m(self, xpath)\u001b[0m\n\u001b[0;32m    392\u001b[0m             \u001b[0melement\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'//div/td[1]'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    393\u001b[0m         \"\"\"\n\u001b[1;32m--> 394\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mby\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mXPATH\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mxpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    395\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    396\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mfind_elements_by_xpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mxpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mfind_element\u001b[1;34m(self, by, value)\u001b[0m\n\u001b[0;32m    974\u001b[0m                 \u001b[0mby\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mCSS_SELECTOR\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    975\u001b[0m                 \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'[name=\"%s\"]'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 976\u001b[1;33m         return self.execute(Command.FIND_ELEMENT, {\n\u001b[0m\u001b[0;32m    977\u001b[0m             \u001b[1;34m'using'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    978\u001b[0m             'value': value})['value']\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m    319\u001b[0m         \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    320\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    322\u001b[0m             response['value'] = self._unwrap_value(\n\u001b[0;32m    323\u001b[0m                 response.get('value', None))\n",
      "\u001b[1;32mD:\\Anaconda\\Anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m    240\u001b[0m                 \u001b[0malert_text\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'alert'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    241\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0malert_text\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 242\u001b[1;33m         \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    243\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    244\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_value_or_default\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNoSuchElementException\u001b[0m: Message: no such element: Unable to locate element: {\"method\":\"xpath\",\"selector\":\"//*[@id=\"vImg\"]\"}\n  (Session info: chrome=91.0.4472.114)\n"
     ]
    }
   ],
   "source": [
    "images_mining()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 图鉴api调用\n",
    "uname = \"ZhaoSu\"\n",
    "pwd = \"zhaosu123\"\n",
    "typeid  = \"3\"\n",
    "img = \"D:\\桌面\\Python worker space\\数据挖掘\\数据挖掘期末项目\\验证码储存\\screenImg.png\"\n",
    "def base64_api(uname, pwd, img, typeid):\n",
    "    with open(img, 'rb') as f:\n",
    "        base64_data = base64.b64encode(f.read())\n",
    "        b64 = base64_data.decode()\n",
    "    data = {\"username\": uname, \"password\": pwd, \"typeid\": typeid, \"image\": b64}\n",
    "    time.sleep(1)\n",
    "    result = json.loads(requests.post(\"http://api.ttshitu.com/predict\", json=data).text)\n",
    "    if result['success']:\n",
    "        return result[\"data\"][\"result\"]\n",
    "    else:\n",
    "        return result[\"message\"]\n",
    "    return \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'dyb4'"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base64_api(uname, pwd, img, typeid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 验证码处理逻辑判断"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 判断是否跳出验证码图片\n",
    "def imageyes():\n",
    "    try:\n",
    "        driver.find_element_by_xpath('//*[@id=\"vImg\"]')\n",
    "        return True\n",
    "    except:\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输入检测出的验证码\n",
    "def put():\n",
    "    imageid = base64_api(uname, pwd, img, typeid)\n",
    "    element = driver.find_element_by_xpath('//*[@id=\"vcode\"]')\n",
    "    element.clear()\n",
    "    element = driver.find_element_by_xpath('//*[@id=\"vcode\"]').send_keys(imageid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 如果验证码输入错误\n",
    "def putno():\n",
    "    try:\n",
    "        driver.fine_element_by_xpath('//*[@id=\"tips\"]')\n",
    "        return True\n",
    "    except:\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 切换验证码界面\n",
    "def switch():\n",
    "    allwindows=driver.window_handles\n",
    "    newpage = len(allwindows)\n",
    "    driver.switch_to.window(driver.window_handles[newpage-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 如果验证码输入正确\n",
    "def putyes():\n",
    "    switch()\n",
    "    driver.close()\n",
    "    driver.switch_to.window(driver.window_handles[2])\n",
    "    time.sleep(1)\n",
    "    switch()\n",
    "    driver.close()\n",
    "    driver.switch_to.window(driver.window_handles[1])\n",
    "    time.sleep(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 文件是否下载成功判断逻辑"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 第一次判断文件下载数量\n",
    "def panduan1():\n",
    "    path = r'D:\\桌面\\Python worker space\\数据挖掘\\数据挖掘期末项目\\验证码储存\\pdf'\n",
    "    file1 = int(len([lists for lists in os.listdir(path) if os.path.isfile(os.path.join(path, lists))]))\n",
    "    return file1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 第二次判断文件下载数量\n",
    "def panduan2():\n",
    "    path = r'D:\\桌面\\Python worker space\\数据挖掘\\数据挖掘期末项目\\验证码储存\\pdf'\n",
    "    file2 = int(len([lists for lists in os.listdir(path) if os.path.isfile(os.path.join(path, lists))]))\n",
    "    return file2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 通过比较前后两次判断确认是否下载成功文件\n",
    "def panduan3():\n",
    "    if file2 > file1:\n",
    "        return True\n",
    "    else:\n",
    "        return False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 下载pdf文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 组装好上述逻辑判断\n",
    "for page in range(1,51):\n",
    "    pdf_xpath = '//*[@id=\"gridTable\"]/table/tbody/tr[{}]/td[2]/a'.format(page)\n",
    "    element = driver.find_element_by_xpath(pdf_xpath)\n",
    "    element.click()\n",
    "    time.sleep(5)\n",
    "    driver.switch_to.window(driver.window_handles[2])\n",
    "    element = driver.find_element_by_xpath('//*[@id=\"pdfDown\"]')\n",
    "    element.click()\n",
    "    time.sleep(3)\n",
    "    switch()#切换到最新页面\n",
    "    time.sleep(5)\n",
    "    imageyes()#判断是否弹出验证码窗口\n",
    "    res = imageyes()\n",
    "    if res is True:\n",
    "        images_mining()#爬取验证码\n",
    "        time.sleep(5)\n",
    "        base64_api(uname, pwd, img, typeid)#调用api识别验证码内容\n",
    "        put()#填写识别出来的验证码内容\n",
    "        panduan1()#检测下载文件夹内的文件数量\n",
    "        file1 = panduan1()\n",
    "        time.sleep(1)\n",
    "        element = driver.find_element_by_xpath('/html/body/div/form/dl/dd/button')\n",
    "        element.click()\n",
    "        panduan2()#检测点击下载后文件夹内的文件数量\n",
    "        file2 = panduan2()\n",
    "        time.sleep(1)\n",
    "        panduan3()#判断下载文件夹内的文件数量是否变化\n",
    "        res2 = panduan3()\n",
    "        while res2 == False:\n",
    "            images_mining()#爬取验证码\n",
    "            base64_api(uname, pwd, img, typeid)#调用百度api识别验证码内容\n",
    "            put()#填写识别出来的验证码内容\n",
    "            panduan1()#检测下载文件夹内的文件数量\n",
    "            file1 = panduan1()\n",
    "            element = driver.find_element_by_xpath('/html/body/div/form/dl/dd/button')\n",
    "            element.click()\n",
    "            panduan2()\n",
    "            file2 = panduan2()\n",
    "            panduan3()#判断下载文件夹内的文件数量是否变化\n",
    "            res2 = panduan3()\n",
    "            if res2 == True:\n",
    "                break\n",
    "        putyes()    \n",
    "    else:\n",
    "        switch()\n",
    "        driver.close()\n",
    "        driver.switch_to.window(driver.window_handles[1])\n",
    "        time.sleep(1)\n",
    "driver.find_element_by_xpath('//*[@id=\"PageNext\"]').click()\n",
    "time.sleep(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
